# Load the cleaned US trending-videos dataset.
# NOTE(review): setwd() with an absolute, user-specific path is fragile in
# shared scripts; prefer a relative path or a project-root helper. Kept so the
# original run is reproducible.
setwd('/Users/Panda/Desktop')
data <- read.csv('USvideos_cleaned.csv', header = TRUE)
library(corrplot)
## corrplot 0.84 loaded
library(ggplot2)
# Mixed correlation plot (numbers + glyphs) of the engagement metrics.
engagement_cols <- c("category_id", "views", "likes", "dislikes", "comment_count")
corrplot.mixed(corr = cor(data[, engagement_cols]))
We observe high correlations between views & likes, between likes & comment_count, and between dislikes & comment_count.
# Scatter of views vs likes with a GAM smoother; colour and size both encode
# likes. The original guides(fill = "none") was a no-op: no fill aesthetic is
# mapped, and theme(legend.position = "none") already removes every legend,
# so the redundant call is dropped.
ggplot(data, aes(x = views, y = likes, colour = likes, size = likes)) +
  geom_jitter() +
  geom_smooth() +
  labs(title = "Views Vs Likes") +
  theme(legend.position = "none")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Scatter of views vs dislikes with a GAM smoother. The redundant
# guides(fill = "none") is dropped: fill is not mapped and
# theme(legend.position = "none") already suppresses all legends.
ggplot(data, aes(x = views, y = dislikes, colour = dislikes, size = dislikes)) +
  geom_jitter() +
  geom_smooth() +
  labs(title = "Views Vs Dislikes") +
  theme(legend.position = "none")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Scatter of likes vs comment_count with a GAM smoother. Redundant
# guides(fill = "none") removed (fill not mapped; legend.position = "none"
# already hides all legends).
ggplot(data, aes(x = likes, y = comment_count,
                 colour = comment_count, size = comment_count)) +
  geom_jitter() +
  geom_smooth() +
  labs(title = "Likes Vs Comments") +
  theme(legend.position = "none")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Scatter of dislikes vs comment_count with a GAM smoother. Redundant
# guides(fill = "none") removed (fill not mapped; legend.position = "none"
# already hides all legends).
ggplot(data, aes(x = dislikes, y = comment_count,
                 colour = comment_count, size = comment_count)) +
  geom_jitter() +
  geom_smooth() +
  labs(title = "Dislikes Vs Comments") +
  theme(legend.position = "none")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Top 10 most-viewed unique videos; `Title` keeps the raw title before the
# short display labels overwrite `title`.
Most_Viewed_Videos <- data %>%
  select(title, channel_title, views, likes, category_id, comment_count) %>%
  distinct(title, .keep_all = TRUE) %>%
  mutate(Title = title) %>%
  arrange(desc(views)) %>%
  head(10)
# NOTE(review): labels are assigned by row position and silently go stale if
# the data (and hence the ranking) changes — confirm against `Title`.
Most_Viewed_Videos$title <- c("El Prestamo by Maluma","Fake Love official by BTS","What is Love by TWICE","Avengers:Infinity War Trailer","Perfect by Ed Sheeran","This is America by Childish Gambino","VENOM Official Trailer","The Shape of 2017 - Youtube Rewind","Nice for What by Drake","Sanju Official Trailer")
# geom_col() is the idiomatic form of geom_bar(stat = "identity").
ggplot(Most_Viewed_Videos, aes(x = reorder(title, -views), y = views)) +
  geom_col(aes(fill = reorder(title, -views))) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Top 10 most-liked unique videos; `Title` preserves the raw title.
Most_Liked_Videos <- data %>%
  select(title, channel_title, likes) %>%
  distinct(title, .keep_all = TRUE) %>%
  mutate(Title = title) %>%
  arrange(desc(likes)) %>%
  head(10)
# NOTE(review): position-based labels — go stale if the ranking changes.
Most_Liked_Videos$title <- c("Fake Love official by BTS","MIC Drop by BTS","Daydream by j-hope","Love Yourself by BTS","Avengers:Infinity War Trailer","Perfect by Ed Sheeran","Euphoria by BTS","Fake Love Extended by BTS","This is America by Childish Gambino","Airplane by j-hope")
ggplot(Most_Liked_Videos, aes(x = reorder(title, -likes), y = likes)) +
  geom_col(aes(fill = reorder(title, -likes))) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Top 10 most-disliked unique videos; `Title` preserves the raw title.
Most_Disliked_Videos <- data %>%
  select(title, channel_title, dislikes) %>%
  distinct(title, .keep_all = TRUE) %>%
  mutate(Title = title) %>%
  arrange(desc(dislikes)) %>%
  head(10)
# NOTE(review): position-based labels — go stale if the ranking changes.
Most_Disliked_Videos$title <- c("So Sorry by Logan Paul","The Shape of 2017 by Youtube Spotlight","Logan Paul is Back by Logan Paul","PSA from Chairman of the FCC Ajit Pai by Daily Caller","Black Ops 4 Multiplayer Reveal Trailer by Call of Duty","Suicide: Be Here Tomorrow. by Logan Paul","Fergie Performs The U.S. National Anthem by MLG Highlights","The FCC repeals its net neutrality rules by Washington Post","What is Love MV by Twice","Santa Diss Track by Logan Paul")
ggplot(Most_Disliked_Videos, aes(x = reorder(title, -dislikes), y = dislikes)) +
  geom_col(aes(fill = reorder(title, -dislikes))) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Top 10 most-commented unique videos; `Title` preserves the raw title.
Most_Commented_Videos <- data %>%
  select(title, channel_title, comment_count) %>%
  distinct(title, .keep_all = TRUE) %>%
  mutate(Title = title) %>%
  arrange(desc(comment_count)) %>%
  head(10)
# NOTE(review): position-based labels — go stale if the ranking changes.
Most_Commented_Videos$title <- c("So Sorry by Logan Paul","Fake Love official by BTS","The Shape of 2017 by Youtube Spotlight", "Logan Paul is Back by Logan Paul","MIC Drop by BTS","Daydream by j-hope","Suicide: Be Here Tomorrow. by Logan Paul","Melting Every Lipstick by Safiya Nygaard", "Love Yourself by BTS","Avengers:Infinity War Trailer")
# BUG FIX: the original ordered the x axis by reorder(title, comment_count)
# (ascending) while the fill used -comment_count, unlike the three sibling
# charts above which are all descending; the x ordering now matches them.
ggplot(Most_Commented_Videos, aes(x = reorder(title, -comment_count), y = comment_count)) +
  geom_col(aes(fill = reorder(title, -comment_count))) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Drop the first column and coerce the metric columns to numeric.
# as.character() first avoids converting factor *level codes* instead of the
# printed values.
# NOTE(review): columns 7:10 are assumed to be the count columns
# (views/likes/dislikes/comment_count) — confirm against the CSV header.
videos <- data[, -1]
metric_cols <- 7:10
videos[metric_cols] <- lapply(videos[metric_cols],
                              function(col) as.numeric(as.character(col)))
# Top 10 channels by average views per trending appearance.
# mean(views) replaces sum(views)/n() (identical result, clearer intent) and
# top_n() names its weight explicitly, silencing the "Selecting by" message.
topChannelAveView <- videos %>%
  group_by(channel_title) %>%
  summarise(view = sum(views), count = n(), Average_View = mean(views)) %>%
  arrange(desc(Average_View)) %>%
  top_n(10, Average_View)
ggplot(topChannelAveView, aes(x = reorder(channel_title, -Average_View), y = Average_View)) +
  geom_col(aes(fill = reorder(channel_title, -Average_View))) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Top 10 channels by average dislikes per trending appearance.
topChannelDis <- videos %>%
  group_by(channel_title) %>%
  summarise(dislike = sum(dislikes), count = n(), Average_Dis = mean(dislikes)) %>%
  arrange(desc(Average_Dis)) %>%
  top_n(10, Average_Dis)
ggplot(topChannelDis, aes(x = reorder(channel_title, -Average_Dis), y = Average_Dis)) +
  geom_col(aes(fill = reorder(channel_title, -Average_Dis))) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Top 10 channels by average likes per trending appearance.
topChannelLike <- videos %>%
  group_by(channel_title) %>%
  summarise(like = sum(likes), count = n(), Average_like = mean(likes)) %>%
  arrange(desc(Average_like)) %>%
  top_n(10, Average_like)
ggplot(topChannelLike, aes(x = reorder(channel_title, -Average_like), y = Average_like)) +
  geom_col(aes(fill = reorder(channel_title, -Average_like))) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Top 10 channels by average comments per trending appearance.
topChannelComment <- videos %>%
  group_by(channel_title) %>%
  summarise(comment = sum(comment_count), count = n(),
            Average_comment = mean(comment_count)) %>%
  arrange(desc(Average_comment)) %>%
  top_n(10, Average_comment)
ggplot(topChannelComment, aes(x = reorder(channel_title, -Average_comment), y = Average_comment)) +
  geom_col(aes(fill = reorder(channel_title, -Average_comment))) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Top 10 categories by average views per trending appearance.
topCateAveView <- videos %>%
  group_by(category_id) %>%
  summarise(view = sum(views), count = n(), Average_View = mean(views)) %>%
  arrange(desc(Average_View)) %>%
  top_n(10, Average_View)
# NOTE(review): category names are matched to category_id by row position in
# the arranged result — they silently go stale if the ranking changes.
topCateAveView$category <- c('Music','Film & Animation','Nonprofits & Activism','Gaming','Entertainment','Sports','People & Blogs','Comedy','Science & Technology','Autos & Vehicles')
ggplot(topCateAveView, aes(x = reorder(category, -Average_View), y = Average_View)) +
  geom_col(aes(fill = reorder(category, -Average_View))) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Top 10 categories by average dislikes per trending appearance.
topCateDis <- videos %>%
  group_by(category_id) %>%
  summarise(dislike = sum(dislikes), count = n(), Average_Dis = mean(dislikes)) %>%
  arrange(desc(Average_Dis)) %>%
  top_n(10, Average_Dis)
# NOTE(review): position-based category labels — verify against category_id.
topCateDis$category <- c('Nonprofits & Activism','Gaming','Music','Entertainment','People & Blogs','Film & Animation','Sports','Comedy','Science & Technology','News & Politics')
ggplot(topCateDis, aes(x = reorder(category, -Average_Dis), y = Average_Dis)) +
  geom_col(aes(fill = reorder(category, -Average_Dis))) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Top 10 categories by average likes per trending appearance.
topCateLike <- videos %>%
  group_by(category_id) %>%
  summarise(like = sum(likes), count = n(), Average_like = mean(likes)) %>%
  arrange(desc(Average_like)) %>%
  top_n(10, Average_like)
# NOTE(review): position-based category labels — verify against category_id.
topCateLike$category <- c('Nonprofits & Activism','Music','Gaming','Film & Animation','Comedy','People & Blogs','Entertainment','Sports', 'Howto & Style','Science & Technology')
ggplot(topCateLike, aes(x = reorder(category, -Average_like), y = Average_like)) +
  geom_col(aes(fill = reorder(category, -Average_like))) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Top 10 categories by average comments per trending appearance.
topCateComment <- videos %>%
  group_by(category_id) %>%
  summarise(comment = sum(comment_count), count = n(),
            Average_comment = mean(comment_count)) %>%
  arrange(desc(Average_comment)) %>%
  top_n(10, Average_comment)
# NOTE(review): position-based category labels — verify against category_id.
topCateComment$category <- c('Nonprofits & Activism','Music','Gaming','People & Blogs','Film & Animation','Entertainment','Comedy','Howto & Style','Sports','Science & Technology')
ggplot(topCateComment, aes(x = reorder(category, -Average_comment), y = Average_comment)) +
  geom_col(aes(fill = reorder(category, -Average_comment))) +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Channels table sorted by total views.
# NOTE(review): top_n() selects the 10 rows with the highest *Average_View*
# (its default weight is the last column) while arrange() sorts by total
# `view` — confirm this mix of criteria is intentional. Weight made explicit
# to preserve behaviour and silence the "Selecting by" message.
videos %>%
  group_by(channel_title) %>%
  summarise(view = sum(views), count = n(), Average_View = mean(views)) %>%
  arrange(desc(view)) %>%
  top_n(10, Average_View)
## Selecting by Average_View
## # A tibble: 10 x 4
## channel_title view count Average_View
## <fct> <dbl> <int> <dbl>
## 1 ChildishGambinoVEVO 3758488765 25 150339551.
## 2 ArianaGrandeVevo 1576959172 43 36673469.
## 3 MalumaVEVO 1551515831 32 48484870.
## 4 FoxStarHindi 1238609854 32 38706558.
## 5 BeckyGVEVO 1182971286 20 59148564.
## 6 YouTube Spotlight 791388476 18 43966026.
## 7 TheWeekndVEVO 778810304 19 40990016
## 8 DrakeVEVO 583521598 14 41680114.
## 9 LuisFonsiVEVO 534738794 10 53473879.
## 10 Kylie Jenner 461064419 10 46106442.
# Channels table sorted by total dislikes; top_n() still selects by
# Average_Dis (see note on the views table above this one in the original
# script) — weight made explicit to preserve behaviour.
videos %>%
  group_by(channel_title) %>%
  summarise(dislike = sum(dislikes), count = n(), Average_Dis = mean(dislikes)) %>%
  arrange(desc(dislike)) %>%
  top_n(10, Average_Dis)
## Selecting by Average_Dis
## # A tibble: 10 x 4
## channel_title dislike count Average_Dis
## <fct> <dbl> <int> <dbl>
## 1 Logan Paul Vlogs 13847251 24 576969.
## 2 YouTube Spotlight 10924092 18 606894
## 3 ChildishGambinoVEVO 6054434 25 242177.
## 4 Call of Duty 5644083 41 137661.
## 5 KatyPerryVEVO 1669622 18 92757.
## 6 BeckyGVEVO 1616616 20 80831.
## 7 Erika Costell 1401810 15 93454
## 8 David Dobrik 802335 9 89148.
## 9 shakiraVEVO 704716 7 100674.
## 10 Daily Caller 671419 3 223806.
# Channels table sorted by total likes; top_n() selects by Average_Like
# (explicit weight preserves the original behaviour).
videos %>%
  group_by(channel_title) %>%
  summarise(like = sum(likes), count = n(), Average_Like = mean(likes)) %>%
  arrange(desc(like)) %>%
  top_n(10, Average_Like)
## Selecting by Average_Like
## # A tibble: 10 x 4
## channel_title like count Average_Like
## <fct> <dbl> <int> <dbl>
## 1 ibighit 199247121 80 2490589.
## 2 ChildishGambinoVEVO 96700818 25 3868033.
## 3 ArianaGrandeVevo 52170970 43 1213278.
## 4 Logan Paul Vlogs 31545290 24 1314387.
## 5 YouTube Spotlight 20173324 18 1120740.
## 6 BeckyGVEVO 19185287 20 959264.
## 7 TheWeekndVEVO 16996782 19 894567.
## 8 LuisFonsiVEVO 16671426 10 1667143.
## 9 David Dobrik 16537616 9 1837513.
## 10 Desimpedidos 3052417 3 1017472.
# Channels table sorted by total comments; top_n() selects by Average_comment
# (explicit weight preserves the original behaviour).
videos %>%
  group_by(channel_title) %>%
  summarise(comment = sum(comment_count), count = n(),
            Average_comment = mean(comment_count)) %>%
  arrange(desc(comment)) %>%
  top_n(10, Average_comment)
## Selecting by Average_comment
## # A tibble: 10 x 4
## channel_title comment count Average_comment
## <fct> <dbl> <int> <dbl>
## 1 ibighit 31817464 80 397718.
## 2 Logan Paul Vlogs 14870370 24 619599.
## 3 ChildishGambinoVEVO 10151289 25 406052.
## 4 jypentertainment 7575510 64 118367.
## 5 YouTube Spotlight 6495154 18 360842.
## 6 Call of Duty 4224430 41 103035.
## 7 David Dobrik 2673859 9 297095.
## 8 The ACE Family 1714577 9 190509.
## 9 TheAngryGrandpaShow 1314491 8 164311.
## 10 Collins Key 1150210 8 143776.
library(cluster)
# Keep only the four engagement metrics for clustering, then confirm every
# column is numeric before imputation.
cluster_cols <- c("views", "dislikes", "likes", "comment_count")
data_cluster <- videos[, cluster_cols]
sapply(data_cluster, class)
## views dislikes likes comment_count
## "numeric" "numeric" "numeric" "numeric"
library(mice)
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Impute missing values with mice (seed fixed for reproducibility) and take
# the first completed dataset.
set.seed(617)
imputation <- mice(data_cluster)
data_cluster <- mice::complete(imputation)
##
## iter imp variable
## 1 1
## 1 2
## 1 3
## 1 4
## 1 5
## 2 1
## 2 2
## 2 3
## 2 4
## 2 5
## 3 1
## 3 2
## 3 3
## 3 4
## 3 5
## 4 1
## 4 2
## 4 3
## 4 4
## 4 5
## 5 1
## 5 2
## 5 3
## 5 4
## 5 5
Hierarchical clustering is not suitable here because the dataset is large, so we use k-means instead.
Elbow plot: total within-cluster sum of squares for k = 1 to 10.
# Total within-cluster SS for k = 1..10. vapply() replaces sapply() for a
# type-stable numeric result; the seed is reset before each fit so every k
# starts from the same RNG state (matching the original).
within_ss <- vapply(1:10, FUN = function(k) {
  set.seed(617)
  kmeans(x = data_cluster, centers = k, iter.max = 1000, nstart = 25)$tot.withinss
}, FUN.VALUE = numeric(1))
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2047450)
# Elbow plot of the within-cluster SS against k.
elbow_df <- data.frame(cluster = 1:10, within_ss)
ggplot(elbow_df, aes(x = cluster, y = within_ss)) +
  geom_line(col = 'steelblue', size = 1.2) +
  geom_point() +
  scale_x_continuous(breaks = seq(1, 10, 1))
Ratio plot: the ratio of between-cluster sum of squares to total sum of squares for each value of k.
# Between-SS / total-SS ratio for k = 1..10; vapply() for a type-stable
# numeric vector, seed reset per fit as in the original.
ratio_ss <- vapply(1:10, FUN = function(k) {
  set.seed(617)
  km <- kmeans(x = data_cluster, centers = k, iter.max = 1000, nstart = 25)
  km$betweenss / km$totss
}, FUN.VALUE = numeric(1))
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2047450)
# Ratio plot of between-SS/total-SS against k.
ratio_df <- data.frame(cluster = 1:10, ratio_ss)
ggplot(ratio_df, aes(x = cluster, y = ratio_ss)) +
  geom_line(col = 'steelblue', size = 1.2) +
  geom_point() +
  scale_x_continuous(breaks = seq(1, 10, 1))
Based on the elbow and ratio plots, we choose 3 clusters.
# Final k-means model with k = 3; report the segment sizes.
set.seed(617)
km <- kmeans(x = data_cluster, centers = 3, iter.max = 10000, nstart = 25)
k_segments <- km$cluster
table(k_segments)
## k_segments
## 1 2 3
## 1358 39498 93
library(stringr)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Two varimax-rotated factor scores for a 2-D view of the k-means segments.
# The original called fa() twice — once per factor column — doubling an
# expensive computation; a single call is computed and reused.
fa_scores <- fa(data_cluster, nfactors = 2, rotate = 'varimax')$scores
temp <- data.frame(cluster = factor(k_segments),
                   factor1 = fa_scores[, 1],
                   factor2 = fa_scores[, 2])
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
# Segments plotted in factor space.
ggplot(temp, aes(x = factor1, y = factor2, col = cluster)) +
  geom_point()
# Mean of each engagement metric within each k-means segment.
# T replaced with TRUE: T is an ordinary, reassignable binding and should not
# be used for the logical constant.
data2 <- cbind(videos, k_segments)
library(dplyr)
data2 %>%
  select(views:comment_count, k_segments) %>%
  group_by(k_segments) %>%
  summarize_all(function(x) round(mean(x, na.rm = TRUE), 2)) %>%
  data.frame()
## k_segments views likes dislikes comment_count
## 1 1 23960331 651853.38 36387.74 68908.13
## 2 2 1368381 47813.78 2057.70 5490.26
## 3 3 108444347 2875072.48 228910.44 381255.40
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.6.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Parse the date columns (trending_date is year-day-month, publish_date is
# month/day/year) and compute the days between publishing and trending.
data$trending_date <- ydm(data$trending_date)
data$publish_date <- mdy(data$publish_date)
data$dif_days <- as.numeric(data$trending_date - data$publish_date)
# Distribution of the publish-to-trending lag, restricted to under 31 days.
data_dif_days_31 <- subset(data, dif_days < 31)
library(ggplot2)
lag_plot <- ggplot(data_dif_days_31,
                   aes(as.factor(dif_days), fill = as.factor(dif_days))) +
  geom_bar() +
  guides(fill = "none") +
  labs(title = " Time between published and trending", subtitle = "In days") +
  xlab(NULL) +
  ylab(NULL)
lag_plot
Converting 'title' and 'description' from factor to character type so that text-mining techniques can be applied.
# Text-mining prep: titles and descriptions as plain character vectors.
data$title <- as.character(data$title)
data$description <- as.character(data$description)
Mean of characters in description
# Average description length in characters.
mean_char <- mean(nchar(data$description))
mean_char
## [1] 1031.495
Median of characters in description
# Median description length in characters.
median_char <- median(nchar(data$description))
median_char
## [1] 827
Mean of characters in title
# Average title length in characters.
mean_char <- mean(nchar(data$title))
mean_char
## [1] 48.57818
Median of characters in title
# Median title length in characters.
median_char <- median(nchar(data$title))
median_char
## [1] 46
Top 10 words in Title
library(tidytext)
# Ten most frequent non-stopword tokens in video titles.
# top_n() weight named explicitly: same rows selected, no "Selecting by"
# message.
data %>%
  unnest_tokens(input = title, output = word) %>%
  anti_join(stop_words) %>%
  select(word) %>%
  group_by(word) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  top_n(10, count) %>%
  ggplot(aes(x = reorder(word, count), y = count, fill = count)) +
  geom_col() +
  xlab('words') +
  coord_flip()
## Joining, by = "word"
## Selecting by count
Word cloud for title
library(wordcloud)
# Word frequencies across all titles (stopwords removed) for the word cloud.
# The original group_by(category_id) had no effect on the result: the
# grouping is immediately replaced by group_by(word), so it is dropped.
wordcloudTitle <-
  data %>%
  unnest_tokens(output = word, input = title) %>%
  anti_join(stop_words) %>%
  group_by(word) %>%
  summarize(freq = n()) %>%
  arrange(desc(freq)) %>%
  ungroup() %>%
  data.frame()
## Joining, by = "word"
# Word cloud of the 100 most frequent title words (seeded for layout
# reproducibility).
set.seed(617)
wordcloud(words = wordcloudTitle$word,
          freq = wordcloudTitle$freq,
          scale = c(2.5, 1.0),
          max.words = 100,
          colors = brewer.pal(9, "Spectral"))
Binary sentiment with ‘bing’
library(ggthemes)
# Counts of positive vs negative title words (bing lexicon).
# guides(fill = F) used the reassignable shortcut F; replaced with the
# string form "none" used elsewhere in this script (see the scatter plots).
data %>%
  group_by(category_id) %>%
  unnest_tokens(output = word, input = title) %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(sentiment) %>%
  count() %>%
  ggplot(aes(x = sentiment, y = n, fill = sentiment)) +
  geom_col() +
  theme_economist() +
  guides(fill = "none") +
  coord_flip()
## Joining, by = "word"
comments_disabled and title
# Recode comments_disabled from the factor levels "FALSE"/"TRUE" to 0/1.
# BUG FIX: the original assigned 0/1 into a factor whose only levels are
# "FALSE"/"TRUE", which raised "invalid factor level" warnings and replaced
# every matched entry with NA, destroying the flag. A logical comparison
# converted to integer yields the intended 0/1 vector.
data$comments_disabled <- as.integer(data$comments_disabled == 'TRUE')
# Proportion of positive vs negative title words by comments_disabled flag.
title_sentiment_by_comments <- data %>%
  select(category_id, title, comments_disabled) %>%
  group_by(category_id) %>%
  unnest_tokens(output = word, input = title) %>%
  ungroup() %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(comments_disabled, sentiment) %>%
  summarize(n = n()) %>%
  mutate(proportion = n / sum(n))
ggplot(title_sentiment_by_comments,
       aes(x = comments_disabled, y = proportion, fill = sentiment)) +
  geom_col() +
  theme_economist() +
  coord_flip()
## Joining, by = "word"
ratings_disabled and title
# Recode ratings_disabled from "FALSE"/"TRUE" factor levels to 0/1.
# BUG FIX: assigning 0/1 into the factor generated NA with "invalid factor
# level" warnings (see the original output); the comparison form is correct.
data$ratings_disabled <- as.integer(data$ratings_disabled == 'TRUE')
# Proportion of positive vs negative title words by ratings_disabled flag.
title_sentiment_by_ratings <- data %>%
  select(category_id, title, ratings_disabled) %>%
  group_by(category_id) %>%
  unnest_tokens(output = word, input = title) %>%
  ungroup() %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(ratings_disabled, sentiment) %>%
  summarize(n = n()) %>%
  mutate(proportion = n / sum(n))
ggplot(title_sentiment_by_ratings,
       aes(x = ratings_disabled, y = proportion, fill = sentiment)) +
  geom_col() +
  theme_economist() +
  coord_flip()
## Joining, by = "word"
video_error_or_removed and title
# Recode video_error_or_removed from "FALSE"/"TRUE" factor levels to 0/1.
# BUG FIX: as with the two flags above, the original factor assignment
# produced NA ("invalid factor level" warnings); this form is correct.
data$video_error_or_removed <- as.integer(data$video_error_or_removed == 'TRUE')
# Proportion of positive vs negative title words by video_error_or_removed.
title_sentiment_by_error <- data %>%
  select(category_id, title, video_error_or_removed) %>%
  group_by(category_id) %>%
  unnest_tokens(output = word, input = title) %>%
  ungroup() %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(video_error_or_removed, sentiment) %>%
  summarize(n = n()) %>%
  mutate(proportion = n / sum(n))
ggplot(title_sentiment_by_error,
       aes(x = video_error_or_removed, y = proportion, fill = sentiment)) +
  geom_col() +
  theme_economist() +
  coord_flip()
## Joining, by = "word"
Top 10 words
# Ten most frequent non-stopword tokens in video descriptions.
# top_n() weight named explicitly (same rows, no message).
data %>%
  unnest_tokens(input = description, output = word) %>%
  anti_join(stop_words) %>%
  select(word) %>%
  group_by(word) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  top_n(10, count) %>%
  ggplot(aes(x = reorder(word, count), y = count, fill = count)) +
  geom_col() +
  xlab('words') +
  coord_flip()
## Joining, by = "word"
## Selecting by count
Word cloud
# Word frequencies across all descriptions for the word cloud.
# Renamed from wordcloudTitle: this table is built from descriptions, not
# titles, and the old name (which is not referenced again after this point)
# was misleading. The no-op group_by(category_id) is also dropped — it is
# immediately replaced by group_by(word).
wordcloudDescription <-
  data %>%
  unnest_tokens(output = word, input = description) %>%
  anti_join(stop_words) %>%
  group_by(word) %>%
  summarize(freq = n()) %>%
  arrange(desc(freq)) %>%
  ungroup() %>%
  data.frame()
set.seed(617)
wordcloud(words = wordcloudDescription$word,
          freq = wordcloudDescription$freq,
          scale = c(2.5, 1.0),
          max.words = 100,
          colors = brewer.pal(9, "Spectral"))
comments_disabled and description
# Proportion of positive vs negative description words by comments_disabled.
desc_sentiment_by_comments <- data %>%
  select(category_id, description, comments_disabled) %>%
  group_by(category_id) %>%
  unnest_tokens(output = word, input = description) %>%
  ungroup() %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(comments_disabled, sentiment) %>%
  summarize(n = n()) %>%
  mutate(proportion = n / sum(n))
ggplot(desc_sentiment_by_comments,
       aes(x = comments_disabled, y = proportion, fill = sentiment)) +
  geom_col() +
  theme_economist() +
  coord_flip()
## Joining, by = "word"
ratings_disabled and description
# Proportion of positive vs negative description words by ratings_disabled.
desc_sentiment_by_ratings <- data %>%
  select(category_id, description, ratings_disabled) %>%
  group_by(category_id) %>%
  unnest_tokens(output = word, input = description) %>%
  ungroup() %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(ratings_disabled, sentiment) %>%
  summarize(n = n()) %>%
  mutate(proportion = n / sum(n))
ggplot(desc_sentiment_by_ratings,
       aes(x = ratings_disabled, y = proportion, fill = sentiment)) +
  geom_col() +
  theme_economist() +
  coord_flip()
video_error_or_removed and description
# Proportion of positive vs negative description words by
# video_error_or_removed.
desc_sentiment_by_error <- data %>%
  select(category_id, description, video_error_or_removed) %>%
  group_by(category_id) %>%
  unnest_tokens(output = word, input = description) %>%
  ungroup() %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(video_error_or_removed, sentiment) %>%
  summarize(n = n()) %>%
  mutate(proportion = n / sum(n))
ggplot(desc_sentiment_by_error,
       aes(x = video_error_or_removed, y = proportion, fill = sentiment)) +
  geom_col() +
  theme_economist() +
  coord_flip()
## Joining, by = "word"
Emotions in Title
# NRC emotion counts across title words.
# guides(fill = F) replaced with guides(fill = "none") — F is a reassignable
# shortcut and the string form matches the rest of the script; the redundant
# X = argument name in reorder() is dropped.
data %>%
  group_by(category_id) %>%
  unnest_tokens(output = word, input = title) %>%
  inner_join(get_sentiments('nrc')) %>%
  group_by(sentiment) %>%
  count() %>%
  ggplot(aes(x = reorder(sentiment, n), y = n, fill = sentiment)) +
  geom_col() +
  guides(fill = "none") +
  coord_flip() +
  theme_wsj()
## Joining, by = "word"
Emotions in description
# NRC emotion counts across description words.
# guides(fill = F) → guides(fill = "none") (reassignable F avoided;
# consistent with the rest of the script).
data %>%
  group_by(category_id) %>%
  unnest_tokens(output = word, input = description) %>%
  inner_join(get_sentiments('nrc')) %>%
  group_by(sentiment) %>%
  count() %>%
  ggplot(aes(x = reorder(sentiment, n), y = n, fill = sentiment)) +
  geom_col() +
  guides(fill = "none") +
  coord_flip() +
  theme_wsj()
## Joining, by = "word"
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:mice':
##
## complete
# Per-word positive/negative counts from title words (bing lexicon), spread
# to one row per word with `positive` and `negative` columns — the layout
# comparison.cloud() expects (words as rownames, one column per group).
wordcloudData =
data%>%
group_by(category_id)%>%
unnest_tokens(output=word,input=title)%>%
anti_join(stop_words)%>%
inner_join(get_sentiments('bing'))%>%
ungroup()%>%
# Count (sentiment, word) pairs, then pivot sentiments into columns,
# filling words that appear with only one polarity with 0.
count(sentiment,word,sort=T)%>%
spread(key=sentiment,value = n,fill=0)%>%
data.frame()
## Joining, by = "word"
## Joining, by = "word"
# comparison.cloud() reads group labels from rownames, so the word column is
# moved into the rownames and only the two count columns are kept.
rownames(wordcloudData) = wordcloudData[,'word']
wordcloudData = wordcloudData[,c('positive','negative')]
# Seed fixed so the cloud layout is reproducible; rot.per = 0 keeps all
# words horizontal.
set.seed(617)
comparison.cloud(term.matrix = wordcloudData,scale = c(2.0,0.5),max.words = 200, rot.per=0)
library(lubridate)
# Combine publish date and hour into a single timestamp, then derive the
# hour-of-day and weekday used by the upload heat map below.
videos$publish_date <- as.character(videos$publish_date)
videos$publish_hour <- as.character(videos$publish_hour)
videos$publish_datetime <- paste(videos$publish_date, videos$publish_hour)
# as.POSIXct replaces the original as.POSIXlt: POSIXlt is a list-based class
# that behaves poorly as a data-frame column, and lubridate::hour() gives
# the same result on either class.
videos$publish_datetime <- as.POSIXct(videos$publish_datetime,
                                      format = "%m/%d/%Y %H:%M",
                                      tz = Sys.timezone())
videos$hour <- hour(videos$publish_datetime)
videos$publish_date <- as.Date(videos$publish_date, "%m/%d/%Y")
videos$day <- weekdays(as.Date(videos$publish_date))
library(ggplot2)
# Heat map over hour-of-day (x) and weekday (y).
# NOTE(review): fill = category_id colours each tile by a category id, yet
# the legend is titled 'Total Uploads' — the colours do not represent upload
# counts. To show totals, aggregate first (e.g. count per day/hour) and map
# fill to that count. Left unchanged here; confirm the intent.
ggplot(videos, aes(x = hour, y = day)) + geom_tile(aes(fill = category_id)) + scale_fill_gradient(name = 'Total Uploads', low = 'white', high = 'navy') + theme(axis.title.y = element_blank())